{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import opik\n",
    "opik.configure(use_local=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from dotenv import load_dotenv\n",
    "load_dotenv()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "import re\n",
    "import glob\n",
    "import subprocess\n",
    "\n",
    "from IPython.display import Markdown, display\n",
    "\n",
    "from llama_index.core import Settings\n",
    "from llama_index.llms.openai import OpenAI\n",
    "\n",
    "from llama_index.core import PromptTemplate\n",
    "from llama_index.embeddings.huggingface import HuggingFaceEmbedding\n",
    "from llama_index.core import VectorStoreIndex, ServiceContext, SimpleDirectoryReader\n",
    "\n",
    "\n",
    "from llama_index.core import Settings\n",
    "from llama_index.core import PromptTemplate\n",
    "from llama_index.core import SimpleDirectoryReader\n",
    "from llama_index.core import VectorStoreIndex\n",
    "from llama_index.core.storage.storage_context import StorageContext\n",
    "from llama_index.core.node_parser import CodeSplitter, MarkdownNodeParser\n",
    "from llama_index.llms.openai import OpenAI\n",
    "from llama_index.llms.anthropic import Anthropic\n",
    "from llama_index.core.indices.vector_store.base import VectorStoreIndex\n",
    "from llama_index.vector_stores.qdrant import QdrantVectorStore\n",
    "from llama_index.embeddings.fastembed import FastEmbedEmbedding\n",
    "from llama_index.core import Settings"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Trace RAG calls "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [],
   "source": [
    "from llama_index.core import Settings\n",
    "from llama_index.core.callbacks import CallbackManager\n",
    "from opik.integrations.llama_index import LlamaIndexCallbackHandler\n",
    "\n",
    "# A callback handler tp automatically log all LlamaIndex operations to Opik\n",
    "opik_callback_handler = LlamaIndexCallbackHandler()\n",
    "\n",
    "# Integrate handler into LlamaIndex's settings\n",
    "Settings.callback_manager = CallbackManager([opik_callback_handler])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [],
   "source": [
    "\n",
    "# Step 2: Define helper functions\n",
    "def parse_github_url(url):\n",
    "    \"\"\"Extract owner and repo name from GitHub URL\"\"\"\n",
    "    pattern = r\"https://github\\.com/([^/]+)/([^/]+)\"\n",
    "    match = re.match(pattern, url)\n",
    "    return match.groups() if match else (None, None)\n",
    "\n",
    "def clone_repo(repo_url):\n",
    "    \"\"\"Clone a GitHub repository\"\"\"\n",
    "    return subprocess.run([\"git\", \"clone\", repo_url], check=True, text=True, capture_output=True)\n",
    "\n",
    "def parse_docs_by_file_types(ext, language, input_dir_path):\n",
    "    \"\"\"Parse documents based on file extension\"\"\"\n",
    "    files = glob.glob(f\"{input_dir_path}/**/*{ext}\", recursive=True)\n",
    "    \n",
    "    if len(files) > 0:\n",
    "        print(f\"Found {len(files)} files with extension {ext}\")\n",
    "        loader = SimpleDirectoryReader(\n",
    "            input_dir=input_dir_path, required_exts=[ext], recursive=True\n",
    "        )\n",
    "        docs = loader.load_data()\n",
    "        parser = (\n",
    "            MarkdownNodeParser()\n",
    "            if ext == \".md\"\n",
    "            else CodeSplitter.from_defaults(language=language)\n",
    "        )\n",
    "        nodes = parser.get_nodes_from_documents(docs)\n",
    "        print(f\"Processed {len(nodes)} nodes from {ext} files\")\n",
    "        return nodes\n",
    "    return []\n",
    "\n",
    "def setup_chat_engine(github_url, model_provider=\"OpenAI o3-mini\"):\n",
    "    \"\"\"\n",
    "    Set up the chat engine for a GitHub repository\n",
    "    Args:\n",
    "        github_url: URL of the GitHub repository\n",
    "        model_provider: 'openai' or 'anthropic'\n",
    "    \"\"\"\n",
    "    # Step 3: Process GitHub URL\n",
    "    owner, repo = parse_github_url(github_url)\n",
    "    if not owner or not repo:\n",
    "        raise ValueError(\"Invalid GitHub URL\")\n",
    "    \n",
    "    print(f\"\\nProcessing repository: {owner}/{repo}\")\n",
    "    input_dir_path = f\"./{repo}\"\n",
    "\n",
    "    # Step 4: Clone repository if it doesn't exist\n",
    "    if not os.path.exists(input_dir_path):\n",
    "        print(\"\\nCloning repository...\")\n",
    "        clone_repo(github_url)\n",
    "\n",
    "    # Step 5: Define file types to process\n",
    "    file_types = {\n",
    "        \".md\": \"markdown\",\n",
    "        \".py\": \"python\",\n",
    "        \".ipynb\": \"python\",\n",
    "        \".js\": \"javascript\",\n",
    "        \".ts\": \"typescript\"\n",
    "    }\n",
    "\n",
    "    # Step 6: Process all files\n",
    "    print(\"\\nProcessing files...\")\n",
    "    nodes = []\n",
    "    for ext, language in file_types.items():\n",
    "        nodes += parse_docs_by_file_types(ext, language, input_dir_path)\n",
    "\n",
    "    if not nodes:\n",
    "        raise ValueError(\"No files were processed from the repository\")\n",
    "\n",
    "    # Step 7: Setup embedding model\n",
    "    print(\"\\nSetting up embedding model...\")\n",
    "    # Settings.embed_model = FastEmbedEmbedding(model_name=\"BAAI/bge-base-en-v1.5\")\n",
    "\n",
    "    # Step 8: Create index\n",
    "    print(\"Creating vector index...\")\n",
    "    index = VectorStoreIndex(nodes=nodes)\n",
    "\n",
    "    # Step 9: Setup LLM and query engine\n",
    "    if model_provider == \"OpenAI o3-mini\":\n",
    "        Settings.llm = OpenAI(model=\"o3-mini\")\n",
    "    elif model_provider == \"Claude 3.7 Sonnet\":\n",
    "        Settings.llm = Anthropic(model=\"claude-3-7-sonnet-20250219\")\n",
    "    elif model_provider == \"Claude 3.5 Sonnet\":\n",
    "        Settings.llm = Anthropic(model=\"claude-3-5-sonnet-20240620\")\n",
    "\n",
    "    query_engine = index.as_query_engine(streaming=True, similarity_top_k=4)\n",
    "\n",
    "    # Step 10: Setup custom prompt template\n",
    "    qa_prompt_tmpl_str = (\n",
    "        \"Context information is below.\\n\"\n",
    "        \"---------------------\\n\"\n",
    "        \"{context_str}\\n\"\n",
    "        \"---------------------\\n\"\n",
    "        \"Given the context information above, you must always include a code snippet in your response.\\n\"\n",
    "        \"Think step by step to answer the query, and then provide a relevant code example that demonstrates the concept.\\n\"\n",
    "        \"Even if the question seems conceptual, translate your answer into a practical code example.\\n\"\n",
    "        \"If you don't know the answer, say 'I don't know!' but still provide a minimal code example of what you think might work.\\n\"\n",
    "        \"Query: {query_str}\\n\"\n",
    "        \"Answer: \"\n",
    "    )\n",
    "    qa_prompt_tmpl = PromptTemplate(qa_prompt_tmpl_str)\n",
    "    query_engine.update_prompts(\n",
    "        {\"response_synthesizer:text_qa_template\": qa_prompt_tmpl}\n",
    "    )\n",
    "\n",
    "    print(\"\\nChat engine setup complete! Ready for questions.\")\n",
    "    return query_engine"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "model_name = 'Claude 3.7 Sonnet'\n",
    "github_url = \"https://github.com/Lightning-AI/LitServe\"\n",
    "query_engine = setup_chat_engine(github_url, model_provider=model_name)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "response = query_engine.query(\"What is this repo about?\") \n",
    "print(response)"
   ]
  },
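  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Since the query engine was created with `streaming=True`, the answer can also be streamed token by token instead of printed all at once. A minimal sketch using LlamaIndex's `print_response_stream` (the question below is just an illustrative example):"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# The engine was built with streaming=True, so the response object exposes\n",
    "# print_response_stream(), which prints tokens as they arrive\n",
    "streaming_response = query_engine.query(\"How do I start a LitServe server?\")\n",
    "streaming_response.print_response_stream()"
   ]
  },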
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Evaluation"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {},
   "outputs": [],
   "source": [
    "from opik import Opik\n",
    "\n",
    "client = Opik()\n",
    "dataset = client.get_or_create_dataset(name=\"Eval Code Generation\")"
   ]
  },
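  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "`get_or_create_dataset` returns an empty dataset on first run, so it needs items before `evaluate` can do anything. A minimal sketch below inserts two hypothetical items: the `input` field is what `evaluation_task` reads, and `expected_output` is the reference code the judge metric compares against."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Hypothetical sample items -- replace with real questions and reference\n",
    "# code for the repository under evaluation\n",
    "dataset.insert([\n",
    "    {\n",
    "        \"input\": \"How do I run a LitServe server?\",\n",
    "        \"expected_output\": \"server = ls.LitServer(api); server.run(port=8000)\",\n",
    "    },\n",
    "    {\n",
    "        \"input\": \"How do I set the port when starting the server?\",\n",
    "        \"expected_output\": \"server.run(port=8080)\",\n",
    "    },\n",
    "])"
   ]
  },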
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {},
   "outputs": [],
   "source": [
    "from opik import track\n",
    "\n",
    "@track\n",
    "def my_llm_application(input: str) -> str:\n",
    "    response = query_engine.query(input)\n",
    "    return str(response)\n",
    "\n",
    "def evaluation_task(x):\n",
    "    return {\n",
    "        \"output\": my_llm_application(x['input'])\n",
    "    }"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {},
   "outputs": [],
   "source": [
    "from opik.evaluation.metrics import base_metric, score_result\n",
    "from openai import OpenAI\n",
    "from typing import Any\n",
    "import json\n",
    "\n",
    "class LLMJudgeMetric(base_metric.BaseMetric):\n",
    "    def __init__(self, name: str = \"Code Quality Evaluation\", model_name: str = \"gpt-4o\"):\n",
    "        self.name = name\n",
    "        self.llm_client = OpenAI()\n",
    "        self.model_name = model_name\n",
    "        self.prompt_template = \"\"\"\n",
    "        You are an expert judge tasked with evaluating the quality of code generation by comparing the AI-generated code to the ground truth code.\n",
    "        \n",
    "        Evaluate how well the AI-generated code matches the ground truth code in terms of:\n",
    "        1. Correctness: Does the generated code implement the same functionality?\n",
    "        2. Completeness: Does the generated code include all necessary components?\n",
    "        3. Efficiency: Is the generated code similarly efficient in its approach?\n",
    "        4. If the generated code is not exactly the same as the ground truth, but the functionality is similar, then still give a high score.\n",
    "        5. Only focus on the code and the functionality, ignore the text.\n",
    "        \n",
    "        The format of your response should be a JSON object with no additional text or backticks that follows the format:\n",
    "        {{\n",
    "            \"score\": <score between 0 and 1>\n",
    "        }}\n",
    "        \n",
    "        Where:\n",
    "        - 0 means the generated code is completely different or incorrect\n",
    "        - 1 means the generated code is functionally equivalent to the ground truth\n",
    "        \n",
    "        AI-generated code: {output}\n",
    "        \n",
    "        Response:\n",
    "        \"\"\"\n",
    "    def score(self, output: str, **ignored_kwargs: Any):\n",
    "        \"\"\"\n",
    "        Score the output of an LLM.\n",
    "\n",
    "        Args:\n",
    "            output: The output of an LLM to score.\n",
    "            **ignored_kwargs: Any additional keyword arguments. This is important so that the metric can be used in the `evaluate` function.\n",
    "        \"\"\"\n",
    "        # Construct the prompt based on the output of the LLM\n",
    "        prompt = self.prompt_template.format(output=output)\n",
    "        # Generate and parse the response from the LLM\n",
    "        response = self.llm_client.chat.completions.create(\n",
    "            model=self.model_name,\n",
    "            messages=[{\"role\": \"user\", \"content\": prompt}]\n",
    "        )\n",
    "        response_dict = json.loads(response.choices[0].message.content)\n",
    "\n",
    "        response_score = float(response_dict[\"score\"])\n",
    "\n",
    "        return score_result.ScoreResult(\n",
    "            name=self.name,\n",
    "            value=response_score\n",
    "        )"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "metadata": {},
   "outputs": [],
   "source": [
    "code_quality_metric = LLMJudgeMetric()\n"
   ]
  },
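  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "A quick sanity check of the judge before the full run: scoring a toy snippet against itself should return a value close to 1 (a hypothetical smoke test, not part of the evaluation itself)."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Hypothetical smoke test: a snippet judged against itself should score ~1\n",
    "sample = \"def add(a, b):\\n    return a + b\"\n",
    "result = code_quality_metric.score(output=sample, expected_output=sample)\n",
    "print(result)"
   ]
  },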
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from opik.evaluation import evaluate\n",
    "\n",
    "evaluation = evaluate(\n",
    "    dataset=dataset,\n",
    "    task=evaluation_task,\n",
    "    experiment_name = model_name,\n",
    "    scoring_metrics=[code_quality_metric],\n",
    "    experiment_config={\n",
    "        \"model\": \"gpt-3.5-turbo\"\n",
    "    }\n",
    ")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "env_gen",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.11"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
